In [1]:
%matplotlib inline
import datasets
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from IPython.display import Audio as ipAudio
import seaborn as sns
In [2]:
# loading datasets
def load_lang_df(lang, data_dir="../data"):
    """Load the training metadata of one language corpus into a DataFrame.

    Reads ``{data_dir}/{lang}_asr_corpus/data/train.jsonl`` as JSON Lines
    (one record per line).

    Parameters
    ----------
    lang : str
        Language prefix of the corpus directory, e.g. ``"tamil"``.
    data_dir : str or path-like, default "../data"
        Directory that contains the ``{lang}_asr_corpus`` folders. The default
        keeps the location the notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        The parsed records, tagged with a ``name`` attribute of the form
        ``"{lang} dataset"`` that the reporting helpers use for titles.
    """
    metadata_file = f"{data_dir}/{lang}_asr_corpus/data/train.jsonl"
    df = pd.read_json(metadata_file, lines=True, orient="records")
    # Plain Python attribute, not a column: pandas does not propagate it to
    # frames derived from this one (copies, slices, ...).
    df.name = f"{lang} dataset"
    return df

# One metadata frame per language, loaded in the same order as before.
tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset = (
    load_lang_df(language)
    for language in ("tamil", "telugu", "kannada", "malayalam")
)
In [3]:
# total number of records in each language's dataset
for lang_df in (tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset):
    print(f"There are {lang_df.shape[0]} records in the {lang_df.name}")
There are 682929 records in the tamil dataset
There are 209270 records in the telugu dataset
There are 172733 records in the kannada dataset
There are 6671 records in the malayalam dataset

Each dataset has a different number of records: the Tamil dataset is the largest (682,929 records) and the Malayalam dataset is the smallest (6,671 records).

In [4]:
# view the first rows of each dataset
for lang_df in (tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset):
    display(lang_df.head())
path sentence length
0 train/jQkLqTQW4HR3nqPswuTikn.mp3 அன்று முதல் இவர் என அழைக்க பட்டார் 3.0
1 train/4D8ijwiKyDo8XcMmxtyzLZ.mp3 அம்மா வேல பாக்குறாங்க அந்த சம்பளம் எண்ணோட இது... 3.0
2 train/VzXxG3jtnEDS6Ms5MFTr5J.mp3 திருப்பூர்ல இருந்து பேசறீங்க ஓகே சார் ஜாப் பண்... 3.0
3 train/dyuFe9oMCiGf8pEYTmKwKs.mp3 அதிகபட்சம் ரூ ஐம்பதாயிரத்திற்கும் மேல் டெபாசிட... 3.0
4 train/LZtuVLEpVYb8iGpQzrdxi4.mp3 நம்மள மாரி ஒரு நூறு பேரு கஸ்டமர் என்ன பண்ணிருப... 3.0
path sentence length
0 train/AbjyJhosm6fizLcjUFfXaT.mp3 దాన్ని సైనిక శక్తి ద్వారా తీసుకోలేరు 3.0
1 train/Lmei9y2siXzFECrPcfvF2A.mp3 మరి అన్న సిస్టం తో తలపడతారు 3.0
2 train/ZWfroY4Tg6Qe3ctQUdXYwv.mp3 విడదీయరాని అంతర్భాగమని నొక్కి చెప్పారు 3.0
3 train/SiHwZzoHAXRtxWb52suehy.mp3 మన దేశానికి చెందిన భజరంగ్ పూనియా 3.0
4 train/gkZwGDpQncY9t5qYDGKgpx.mp3 అప్పుడప్పుడు 3.0
path sentence length
0 train/jyEJg8nBjnbkoEneRffsYd.mp3 ಒಂದು ರೀತಿಯ ರಾಸಾಯನಿಕ ಉತ್ಪತ್ತಿಯಾಗುತ್ತದೆ 3.0
1 train/nKFq47ayVrjFdbwZHjfhXL.mp3 ವೈದ್ಯಕೀಯ ಆರೈಕೆಯಲ್ಲಿದ್ದಾರೆ ಎಂದು ತಿಳಿಸಿದರು 3.0
2 train/oPtqEqXDAHwNMdY5r8FJ4p.mp3 ವಿದ್ಯಾರ್ಥಿಗಳೇ ನಿಮ್ಮ ಉತ್ತರವನ್ನು ನೋಡೋಣ 3.0
3 train/Yn3bwtW3BVrx4fjjzdSUCH.mp3 ಕವನ ಸಂಕಲನ ಯಾವುದು ಮೊದಲ ಕವನ ಸಂಕಲನ ಯಾವುದು 3.0
4 train/SHTXDzpscMkR6HBpXjZDYF.mp3 ಶಿವಮೊಗ್ಗದಲ್ಲಿ ಇಂದು ಸುದ್ದಿಗಾರರ ಜೊತೆ ಮಾತನಾಡಿದ ಅವರು 3.0
path sentence length
0 train/i29JGozoiWEkqKrzAL4Dd5.mp3 ഭയാനക പ്രതിസന്ധി 3.0
1 train/iGzPqKPmLBDzYbjEfkqeqR.mp3 പ്രകാശം എപ്പോഴും നല്ലത് ആണല്ലോ 3.0
2 train/Ze7U6rvcC8gqRiJY8uvxWC.mp3 ഇപ്പോൾ ഉദാഹരണമായിട്ട് നമ്മൾ രാവിലെ 3.0
3 train/MYoRSKzKXWc6uf2FfAS3Uy.mp3 ടൂറിസം കേന്ദ്രം ഓൺലൈൻ വയനാട് 3.0
4 train/XgiGQi8ECz8AELKaN5UC2A.mp3 ഈ പ്രവചിക്കാവുന്ന കടകം 3.0

As seen, each dataset has three columns: the path column holds the relative path to the audio file, the sentence column contains the transcript, and the length column contains the length of the audio in seconds.

In [5]:
# Summary statistics of the audio duration
def display_audio_stats(df):
    """Print a heading and display summary statistics of clip length in hours.

    The ``length`` column (described as seconds in this notebook) is divided
    by 3600 before being summarised with ``describe()``. The result is shown
    as a single-row table whose row label is ``length``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``length`` column and a ``name`` attribute that
        is used in the printed heading.
    """
    length_in_hours = df["length"] / 3600
    summary_row = length_in_hours.describe().to_frame().T
    print(f"Audio Statistics in Hours for {df.name}")
    display(summary_row)
In [6]:
# Duration statistics for every language, in the usual order.
for lang_df in (tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset):
    display_audio_stats(lang_df)
Audio Statistics in Hours for tamil dataset
count mean std min 25% 50% 75% max
length 682929.0 0.001894 0.000819 0.000833 0.001256 0.001675 0.00237 0.008325
Audio Statistics in Hours for telugu dataset
count mean std min 25% 50% 75% max
length 209270.0 0.00185 0.000835 0.000833 0.001175 0.001608 0.002367 0.004167
Audio Statistics in Hours for kannada dataset
count mean std min 25% 50% 75% max
length 172733.0 0.002077 0.000893 0.000833 0.001333 0.001883 0.002717 0.007443
Audio Statistics in Hours for malayalam dataset
count mean std min 25% 50% 75% max
length 6671.0 0.001518 0.000642 0.000833 0.00105 0.001333 0.001742 0.004167
In [7]:
def display_audio_hists(dfs):
    """Plot one audio-length histogram (with a KDE overlay) per dataset.

    Panels are laid out on a two-column grid whose number of rows follows the
    number of frames passed in; four frames give the original 2x2 figure of
    size 20x15. Panels left over on the last row are hidden.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames with a numeric ``length`` column and a ``name`` attribute that
        is used in each panel title.
    """
    dfs = list(dfs)
    n_cols = 2
    # Ceiling division; always keep at least one row so subplots() is valid.
    n_rows = max(1, -(-len(dfs) // n_cols))
    fig, axs = plt.subplots(ncols=n_cols, nrows=n_rows, figsize=(20, 7.5 * n_rows))
    axs = axs.ravel()

    for ax, df in zip(axs, dfs):
        sns.histplot(data=df, x="length", kde=True, ax=ax)
        ax.set_xlabel("Duration in Seconds")
        ax.set_title(f"Histogram of audio duration for {df.name}", size=14)

    # Hide panels that have no dataset when the grid is not completely filled.
    for spare_ax in axs[len(dfs):]:
        spare_ax.set_visible(False)

    # despine() acts on every axes of the figure, so one call after the loop is enough.
    sns.despine(fig=fig, top=True, right=True)
    plt.show()
In [8]:
# Audio-length histograms for all four languages.
display_audio_hists(
    [tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset]
)

The audio-duration distributions of all the datasets are right-skewed (positively skewed): most clips are short, under 15 seconds, with a thin tail of longer recordings. This will be useful when we truncate and pad our data for processing through our model.

In [9]:
def plot_audio_violins(datasets):
    """Draw one violin of audio length per dataset on a shared figure.

    Each frame's ``length`` values are stacked into a long-form table with a
    ``duration(s)`` column and a ``dataset`` column holding the frame's
    ``name`` attribute, which becomes the x-axis category.

    Parameters
    ----------
    datasets : iterable of pandas.DataFrame
        Frames with a ``length`` column and a ``name`` attribute. (Inside this
        function the parameter name hides the imported ``datasets`` module,
        which is not used here.)
    """
    long_form = pd.concat(
        [
            pd.DataFrame(source["length"].tolist(), columns=["duration(s)"]).assign(
                dataset=source.name
            )
            for source in datasets
        ],
        axis=0,
    )
    plt.figure(figsize=(18, 10))
    violin_ax = sns.violinplot(data=long_form, y="duration(s)", x="dataset")
    violin_ax.set_title("Distribution of Audio duration across datasets", size=14)
    sns.despine(top=True, right=True)
    plt.show()
In [10]:
# Violin plots of audio length, one violin per language.
plot_audio_violins(
    [tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset]
)
In [11]:
# Statistics for the transcription length
def display_transcription_stats(df):
    """Print a heading and display character- and word-count statistics.

    The displayed table has two rows, ``Character`` and ``Word``, each holding
    the ``describe()`` summary of the per-sentence counts. Character counts
    are string lengths (``len``), i.e. Unicode code points; word counts come
    from splitting on whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``sentence`` column and a ``name`` attribute that
        is used in the printed heading.
    """
    sentences = df["sentence"]
    # The .str accessors turn a missing transcript into NaN (which describe()
    # skips) instead of raising like a bare len() would.
    char_stats = sentences.str.len().describe().rename("Character")
    word_stats = sentences.str.split().str.len().describe().rename("Word")
    print(f"Transcription Statistics for {df.name}")
    stats = pd.concat([char_stats, word_stats], axis=1).T
    display(stats)
In [12]:
# Transcript-length statistics for every language, in the usual order.
for lang_df in (tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset):
    display_transcription_stats(lang_df)
Transcription Statistics for tamil dataset
count mean std min 25% 50% 75% max
Character 682929.0 103.259850 54.817564 2.0 64.0 88.0 132.0 524.0
Word 682929.0 11.741752 6.047904 1.0 7.0 10.0 15.0 62.0
Transcription Statistics for telugu dataset
count mean std min 25% 50% 75% max
Character 209270.0 81.527051 44.311727 4.0 49.0 72.0 107.0 289.0
Word 209270.0 10.109251 5.297302 1.0 6.0 9.0 13.0 36.0
Transcription Statistics for kannada dataset
count mean std min 25% 50% 75% max
Character 172733.0 75.106627 42.335828 2.0 44.0 68.0 100.0 249.0
Word 172733.0 9.217712 5.084692 1.0 5.0 8.0 12.0 31.0
Transcription Statistics for malayalam dataset
count mean std min 25% 50% 75% max
Character 6671.0 71.999550 39.473064 4.0 48.0 64.0 85.0 303.0
Word 6671.0 7.987258 4.484889 1.0 5.0 7.0 9.0 36.0
In [13]:
def display_transcription_hists(dfs, stat_type="word"):
    """Plot one transcript-length histogram (with a KDE overlay) per dataset.

    Panels are laid out on a two-column grid whose number of rows follows the
    number of frames passed in; four frames give the original 2x2 figure of
    size 20x15. Panels left over on the last row are hidden.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames with a string ``sentence`` column and a ``name`` attribute that
        is used in each panel title.
    stat_type : str, default "word"
        ``"word"`` counts whitespace-separated tokens; any other value counts
        characters (string length). The value is also used verbatim in the
        x-axis label.
    """
    dfs = list(dfs)
    n_cols = 2
    # Ceiling division; always keep at least one row so subplots() is valid.
    n_rows = max(1, -(-len(dfs) // n_cols))
    fig, axs = plt.subplots(ncols=n_cols, nrows=n_rows, figsize=(20, 7.5 * n_rows))
    axs = axs.ravel()

    for ax, df in zip(axs, dfs):
        sentences = df["sentence"]
        # .str accessors yield NaN for a missing transcript instead of raising.
        if stat_type == "word":
            lengths = sentences.str.split().str.len()
        else:
            lengths = sentences.str.len()
        # Plot from a one-column frame rather than copying the whole dataset.
        sns.histplot(data=lengths.to_frame("t_length"), x="t_length", kde=True, ax=ax)
        ax.set_xlabel(f"Length of transcription in {stat_type}s")
        ax.set_title(f"Histogram of transcription length for {df.name}", size=14)

    # Hide panels that have no dataset when the grid is not completely filled.
    for spare_ax in axs[len(dfs):]:
        spare_ax.set_visible(False)

    # despine() acts on every axes of the figure, so one call after the loop is enough.
    sns.despine(fig=fig, top=True, right=True)
    plt.show()
In [14]:
# Word-level histograms first, then character-level ones.
for length_unit in ("word", "character"):
    display_transcription_hists(
        (tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset),
        stat_type=length_unit,
    )
In [15]:
def plot_transcript_violins(datasets, stat_type="word"):
    """Draw one violin of transcript length per dataset on a shared figure.

    Parameters
    ----------
    datasets : iterable of pandas.DataFrame
        Frames with a string ``sentence`` column and a ``name`` attribute that
        becomes the x-axis category. (Inside this function the parameter name
        hides the imported ``datasets`` module, which is not used here.)
    stat_type : str, default "word"
        ``"word"`` counts whitespace-separated tokens; any other value counts
        characters (string length). The value is also used verbatim in the
        y-axis label.
    """
    length_col = f"Transcript Length({stat_type})"
    item_dfs = []
    for dataset in datasets:
        sentences = dataset["sentence"]
        # .str accessors yield NaN for a missing transcript instead of raising.
        if stat_type == "word":
            lengths = sentences.str.split().str.len()
        else:
            lengths = sentences.str.len()
        # Only the lengths are needed, so the full frame is not copied.
        item_df = pd.DataFrame(lengths.tolist(), columns=[length_col])
        item_df["dataset"] = dataset.name
        item_dfs.append(item_df)
    # ignore_index gives the stacked frame unique row labels.
    item_dfs = pd.concat(item_dfs, axis=0, ignore_index=True)
    plt.figure(figsize=(18, 10))
    sns.violinplot(data=item_dfs, y=length_col, x="dataset")
    plt.title("Distribution of Transcript lengths across datasets", size=14)
    sns.despine(top=True, right=True)
    plt.show()
In [16]:
# Word-level violins first, then character-level ones.
for length_unit in ("word", "characters"):
    plot_transcript_violins(
        (tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset),
        length_unit,
    )
In [17]:
# Audio players and waveforms
def plot_sample_audio_from_dataset(df):
    """Play and plot the first four training clips of the corpus behind ``df``.

    The language is taken from the first word of ``df.name`` and used to find
    ``../data/{lang}_asr_corpus/``, which is loaded with
    ``datasets.load_dataset``. For each of the first four items the transcript
    is printed, an audio player is displayed, and the waveform is drawn on one
    panel of a 2x2 figure with time in seconds on the x-axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``name`` attribute starts with the language name
        (e.g. ``"tamil dataset"``). Only the name is read here.
    """
    # Local import: numpy is not among the imports visible at the top of the notebook.
    import numpy as np

    lang = df.name.split()[0]
    dataset_dir = f"../data/{lang}_asr_corpus/"
    dataset = datasets.load_dataset(dataset_dir, split="train")
    dataset = dataset.select(range(4))
    fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(20, 10))
    ax = ax.ravel()
    for i, item in enumerate(dataset):
        samples = np.asarray(item["audio"]["array"])
        rate = item["audio"]["sampling_rate"]
        # Display metadata is not rendered, so print the transcript to make it visible.
        print(f"Transcript: {item['sentence']}")
        display(
            ipAudio(samples, rate=rate),
            metadata={"transcript": item["sentence"]})
        # Convert sample indices to seconds so the x-axis really is time.
        ax[i].plot(np.arange(len(samples)) / rate, samples)
        ax[i].set_xlabel("Time (s)")
        ax[i].set_ylabel("Amplitude")
    fig.suptitle(f"Waveforms of samples from {df.name}", size=14)
    sns.despine(top=True, right=True)
    plt.show()
In [18]:
# Sample players and waveforms, in the order tamil, malayalam, telugu, kannada.
sample_sources = (tamil_dataset, malayalam_dataset, telugu_dataset, kannada_dataset)
for dataset in sample_sources:
    plot_sample_audio_from_dataset(dataset)
Found cached dataset tamil_asr_corpus (/media/mugan/data/.cache/huggingface/tamil_asr_corpus/default/1.1.0/f76ac20a4e5d50f03062059f7e12e4b59211896d543edba0ab2503b8e74996b0)
Your browser does not support the audio element.
Your browser does not support the audio element.
Your browser does not support the audio element.
Your browser does not support the audio element.
Found cached dataset malayalam_asr_corpus (/media/mugan/data/.cache/huggingface/malayalam_asr_corpus/default/1.1.0/9f936af6b0d95b92954b2a4c3351393722f0452db11cd15bb0bc3374c54bdd77)
Your browser does not support the audio element.
Your browser does not support the audio element.
Your browser does not support the audio element.
Your browser does not support the audio element.
Found cached dataset telugu_asr_corpus (/media/mugan/data/.cache/huggingface/telugu_asr_corpus/default/1.1.0/b107eea8e2aaf2216292228368c5a07b47e307fa0e08fd77dd0cf113f72c87dc)
Your browser does not support the audio element.
Your browser does not support the audio element.
Your browser does not support the audio element.
Your browser does not support the audio element.
Found cached dataset kannada_asr_corpus (/media/mugan/data/.cache/huggingface/kannada_asr_corpus/default/1.1.0/de4ccc7ad1b5663213e62c29c696d3a848d9b3596c46be431a03d3649d14f8a7)
Your browser does not support the audio element.
Your browser does not support the audio element.
Your browser does not support the audio element.
Your browser does not support the audio element.